{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import cPickle\n",
    "import numpy as np\n",
    "\n",
    "from keras import backend\n",
    "from keras.models import Sequential\n",
    "from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape\n",
    "from keras.layers.embeddings import Embedding\n",
    "from keras.layers.convolutional import Convolution2D, MaxPooling2D\n",
    "from keras.optimizers import Adadelta\n",
    "from keras.constraints import unitnorm\n",
    "from keras.regularizers import l2\n",
    "\n",
    "from sklearn.metrics import roc_auc_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load train, validation and test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_idx_from_sent(sent, word_idx_map, max_l=51, kernel_size=5):\n",
    "    \"\"\"\n",
    "    Transforms sentence into a list of indices. Pad with zeroes.\n",
    "    \"\"\"\n",
    "    x = []\n",
    "    pad = kernel_size - 1\n",
    "    for i in xrange(pad):\n",
    "        x.append(0)\n",
    "    words = sent.split()\n",
    "    for word in words:\n",
    "        if word in word_idx_map:\n",
    "            x.append(word_idx_map[word])\n",
    "    while len(x) < max_l+2*pad:\n",
    "        x.append(0)\n",
    "    return x\n",
    "\n",
    "def make_idx_data(revs, word_idx_map, max_l=51, kernel_size=5):\n",
    "    \"\"\"\n",
    "    Transforms sentences into a 2-d matrix.\n",
    "    \"\"\"\n",
    "    train, val, test = [], [], []\n",
    "    for rev in revs:\n",
    "        sent = get_idx_from_sent(rev['text'], word_idx_map, max_l, kernel_size)\n",
    "        sent.append(rev['y'])\n",
    "        if rev['split'] == 1:\n",
    "            train.append(sent)\n",
    "        elif rev['split'] == 0:\n",
    "            val.append(sent)\n",
    "        else:\n",
    "            test.append(sent)\n",
    "    train = np.array(train, dtype=np.int)\n",
    "    val = np.array(val, dtype=np.int)\n",
    "    test = np.array(test, dtype=np.int)\n",
    "    return [train, val, test]\n",
    "\n",
    "\n",
    "print \"loading data...\"\n",
    "x = cPickle.load(open(\"imdb-train-val-test.pickle\", \"rb\"))\n",
    "revs, W, word_idx_map, vocab = x[0], x[1], x[2], x[3]\n",
    "print \"data loaded!\"\n",
    "\n",
    "\n",
    "datasets = make_idx_data(revs, word_idx_map, max_l=2633, kernel_size=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "Put train data in separate NumPy arrays"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Train data preparation\n",
    "N = datasets[0].shape[0]\n",
    "conv_input_width = W.shape[1]\n",
    "conv_input_height = int(datasets[0].shape[1]-1)\n",
    "\n",
    "# For each word write a word index (not vector) to X tensor\n",
    "train_X = np.zeros((N, conv_input_height), dtype=np.int)\n",
    "train_Y = np.zeros((N, 2), dtype=np.int)\n",
    "for i in xrange(N):\n",
    "    for j in xrange(conv_input_height):\n",
    "        train_X[i, j] = datasets[0][i, j]\n",
    "    train_Y[i, datasets[0][i, -1]] = 1\n",
    "    \n",
    "print 'train_X.shape = {}'.format(train_X.shape)\n",
    "print 'train_Y.shape = {}'.format(train_Y.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Put validation data in separate NumPy arrays"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Validation data preparation\n",
    "Nv = datasets[1].shape[0]\n",
    "\n",
    "# For each word write a word index (not vector) to X tensor\n",
    "val_X = np.zeros((Nv, conv_input_height), dtype=np.int)\n",
    "val_Y = np.zeros((Nv, 2), dtype=np.int)\n",
    "for i in xrange(Nv):\n",
    "    for j in xrange(conv_input_height):\n",
    "        val_X[i, j] = datasets[1][i, j]\n",
    "    val_Y[i, datasets[1][i, -1]] = 1\n",
    "    \n",
    "print 'val_X.shape = {}'.format(val_X.shape)\n",
    "print 'val_Y.shape = {}'.format(val_Y.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's define and compile CNN model with Keras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "backend.set_image_dim_ordering('th')\n",
    "\n",
    "# Number of feature maps (outputs of convolutional layer)\n",
    "N_fm = 300\n",
    "# kernel size of convolutional layer\n",
    "kernel_size = 8\n",
    "\n",
    "model = Sequential()\n",
    "# Embedding layer (lookup table of trainable word vectors)\n",
    "model.add(Embedding(input_dim=W.shape[0], \n",
    "                    output_dim=W.shape[1], \n",
    "                    input_length=conv_input_height,\n",
    "                    weights=[W], \n",
    "                    W_constraint=unitnorm()))\n",
    "# Reshape word vectors from Embedding to tensor format suitable for Convolutional layer\n",
    "model.add(Reshape((1, conv_input_height, conv_input_width)))\n",
    "\n",
    "# first convolutional layer\n",
    "model.add(Convolution2D(N_fm, \n",
    "                        kernel_size, \n",
    "                        conv_input_width, \n",
    "                        border_mode='valid', \n",
    "                        W_regularizer=l2(0.0001)))\n",
    "# ReLU activation\n",
    "model.add(Activation('relu'))\n",
    "\n",
    "# aggregate data in every feature map to scalar using MAX operation\n",
    "model.add(MaxPooling2D(pool_size=(conv_input_height-kernel_size+1, 1)))\n",
    "\n",
    "model.add(Flatten())\n",
    "model.add(Dropout(0.5))\n",
    "# Inner Product layer (as in regular neural network, but without non-linear activation function)\n",
    "model.add(Dense(2))\n",
    "# SoftMax activation; actually, Dense+SoftMax works as Multinomial Logistic Regression\n",
    "model.add(Activation('softmax'))\n",
    "\n",
    "# Custom optimizers could be used, though right now standard adadelta is employed\n",
    "opt = Adadelta(lr=1.0, rho=0.95, epsilon=1e-6)\n",
    "model.compile(loss='categorical_crossentropy', \n",
    "              optimizer=opt,\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "epoch = 0\n",
    "val_acc = []\n",
    "val_auc = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train model for N_epoch epochs (could be run as many times as needed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "N_epoch = 3\n",
    "\n",
    "for i in xrange(N_epoch):\n",
    "    model.fit(train_X, train_Y, batch_size=50, nb_epoch=1, verbose=1)\n",
    "    output = model.predict_proba(val_X, batch_size=10, verbose=1)\n",
    "    # find validation accuracy using the best threshold value t\n",
    "    vacc = np.max([np.sum((output[:,1]>t)==(val_Y[:,1]>0.5))*1.0/len(output) for t in np.arange(0.0, 1.0, 0.01)])\n",
    "    # find validation AUC\n",
    "    vauc = roc_auc_score(val_Y, output)\n",
    "    val_acc.append(vacc)\n",
    "    val_auc.append(vauc)\n",
    "    print 'Epoch {}: validation accuracy = {:.3%}, validation AUC = {:.3%}'.format(epoch, vacc, vauc)\n",
    "    epoch += 1\n",
    "    \n",
    "print '{} epochs passed'.format(epoch)\n",
    "print 'Accuracy on validation dataset:'\n",
    "print val_acc\n",
    "print 'AUC on validation dataset:'\n",
    "print val_auc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "Save model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "model.save_weights('cnn_3epochs.model')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "Put test data in separate NumPy array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Test data preparation\n",
    "Nt = datasets[2].shape[0]\n",
    "\n",
    "# For each word write a word index (not vector) to X tensor\n",
    "test_X = np.zeros((Nt, conv_input_height), dtype=np.int)\n",
    "for i in xrange(Nt):\n",
    "    for j in xrange(conv_input_height):\n",
    "        test_X[i, j] = datasets[2][i, j]\n",
    "    \n",
    "print 'test_X.shape = {}'.format(test_X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "p = model.predict_proba(test_X, batch_size=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Prepare submission file for Kaggle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "data = pd.read_csv('DATA/testData.tsv', sep='\\t')\n",
    "d = pd.DataFrame({'id': data['id'], 'sentiment': p[:,0]})\n",
    "d.to_csv('cnn_3epochs.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
